library(rtracklayer)
library(readxl)
library(readr)
library(dplyr)
library(stringr)
library(tidyr)
library(UpSetR)
library(genpwr)
library(ggplot2)
library(ggman)
library(microshades)We ran a conditional and joint analysis using GCTA to refine the list of independent loci.
Variants from Wray et al 2018, Howard et al 2019 (remove first and last rows with table captions), and Levey et al 2021, Giannakopoulou et al 2021, and the GWAS catalog for unipolar depression. Parse out regions from Wray and Howard and load queried regions for other results
tags <- read_table(snakemake@input$tags)##
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## SNP = col_character(),
## CHR = col_double(),
## BP = col_double(),
## NTAG = col_double(),
## LEFT = col_double(),
## RIGHT = col_double(),
## KBSPAN = col_double(),
## TAGS = col_character()
## )
wray <- read_tsv(snakemake@input$wray) %>%
separate(`Region (Mb)`, into=c('range.left.Mb', 'range.right.Mb'), sep='–', convert=TRUE) %>%
mutate(range.left=range.left.Mb*1e6, range.right=range.right.Mb*1e6)## Rows: 44 Columns: 11
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): Region (Mb), SNP, P, A1/A2, Prev., Gene context
## dbl (4): Chr., OR (A1), s.e. (log(OR)), Freq.
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
howard <- read_excel(snakemake@input$howard, skip=2, n_max=102) %>%
separate(`Region (bp) of clumped variants (P < 10-4) in linkage disequilibrium (r2 > 0.1) with lead variant`, into=c('range.left', 'range.right'), sep='-', convert=TRUE)## New names:
## • `Odds Ratio` -> `Odds Ratio...8`
## • `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...9`
## • `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...10`
## • `Log(Odds Ratio)` -> `Log(Odds Ratio)...11`
## • `Standard error of the Log(Odds Ratio)` -> `Standard error of the Log(Odds Ratio)...12`
## • `P-value` -> `P-value...13`
## • `Odds Ratio` -> `Odds Ratio...14`
## • `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...15`
## • `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...16`
## • `Log(Odds Ratio)` -> `Log(Odds Ratio)...17`
## • `Standard error of the Log(Odds Ratio)` -> `Standard error of the Log(Odds Ratio)...18`
## • `P-value` -> `P-value...19`
## • `Odds Ratio` -> `Odds Ratio...20`
## • `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...21`
## • `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...22`
## • `Log(Odds Ratio)` -> `Log(Odds Ratio)...23`
## • `Standard error of the Log(Odds Ratio)` -> `Standard error of the Log(Odds Ratio)...24`
## • `P-value` -> `P-value...25`
## • `Odds Ratio` -> `Odds Ratio...26`
## • `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...27`
## • `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...28`
## • `Log(Odds Ratio)` -> `Log(Odds Ratio)...29`
## • `Standard error of the Log(Odds Ratio)` -> `Standard error of the Log(Odds Ratio)...30`
## • `P-value` -> `P-value...31`
## • `Direction` -> `Direction...32`
## • `Odds Ratio` -> `Odds Ratio...33`
## • `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...34`
## • `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...35`
## • `Log(Odds Ratio)` -> `Log(Odds Ratio)...36`
## • `Standard error of the Log(Odds Ratio)` -> `Standard error of the Log(Odds Ratio)...37`
## • `P-value` -> `P-value...38`
## • `Odds Ratio` -> `Odds Ratio...39`
## • `Lower 95% Confidence Interval` -> `Lower 95% Confidence Interval...40`
## • `Upper 95% Confidence Interval` -> `Upper 95% Confidence Interval...41`
## • `Log(Odds Ratio)` -> `Log(Odds Ratio)...42`
## • `Standard error of the Log(Odds Ratio)` -> `Standard error of the Log(Odds Ratio)...43`
## • `P-value` -> `P-value...44`
## • `Direction` -> `Direction...45`
levey <- read_tsv(snakemake@input$levey, col_types=cols(CHR.BP=col_character())) %>%
left_join(select(tags, SNP, LEFT, RIGHT), by=c('rsid'='SNP')) %>%
mutate(LEFT=if_else(is.na(LEFT), true=BP, false=LEFT),
RIGHT=if_else(is.na(RIGHT), true=BP, false=RIGHT))
giannakopoulou <- read_tsv(snakemake@input$giannakopoulou, col_types=cols('CHR:POS'=col_character())) %>%
separate(`CHR:POS`, into=c('CHR', 'POS'), convert=TRUE) %>%
left_join(select(tags, SNP, LEFT, RIGHT), by='SNP') %>%
mutate(LEFT=if_else(is.na(LEFT), true=as.numeric(POS), false=LEFT),
RIGHT=if_else(is.na(RIGHT), true=as.numeric(POS), false=RIGHT))
gwas_catalog <- read_tsv(snakemake@input$catalog) %>%
filter(!is.na(CHR_ID)) %>%
mutate(CHR=if_else(CHR_ID == 'X', true=23, false=as.numeric(CHR_ID)),
POS=as.numeric(CHR_POS),
SNP=paste0('rs', SNP_ID_CURRENT)) %>%
filter(!is.na(POS)) %>%
left_join(select(tags, SNP, LEFT, RIGHT), by='SNP') %>%
mutate(LEFT=if_else(is.na(LEFT), true=POS, false=LEFT),
RIGHT=if_else(is.na(RIGHT), true=POS, false=RIGHT))## Rows: 2391 Columns: 38
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (28): FIRST AUTHOR, JOURNAL, LINK, STUDY, DISEASE/TRAIT, INITIAL SAMPLE...
## dbl (8): PUBMEDID, UPSTREAM_GENE_DISTANCE, DOWNSTREAM_GENE_DISTANCE, MERGE...
## date (2): DATE ADDED TO CATALOG, DATE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning in replace_with(out, !condition, false, "`false`", "length of
## `condition`"): NAs introduced by coercion
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
List of clumped and COJO SNPs and regions
Load list of COJO SNPs (with multiple or single GW-sig SNPs)
cojo_mult <- read_tsv(snakemake@input$cojo)## Rows: 556 Columns: 28
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): SNP, A1, A2, Direction
## dbl (24): region, snp_idx, CHR, BP, FRQ_A_525197, FRQ_U_3362335, INFO, OR, S...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cojo_single <- read_tsv(snakemake@input$singleton)## Rows: 66 Columns: 28
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): SNP, A1, A2, Direction
## dbl (24): region, snp_idx, CHR, BP, FRQ_A_525197, FRQ_U_3362335, INFO, OR, S...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cojo <- bind_rows(cojo_mult, cojo_single) %>% arrange(CHR, BP)Clumped results from Ricopili
rp <- read_table(snakemake@input$rp_clump) %>% filter(P <= 5e-8)##
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## SNP = col_character(),
## A1A2 = col_character(),
## `(Nca,Nco,Neff)Dir` = col_character(),
## `LD-friends(0.1).p0.001` = col_character(),
## `LD-friends(0.6).p0.001` = col_character(),
## gwas_catalog_span.6 = col_character(),
## `genes.6.50kb(dist2index)` = col_character(),
## N.genes.6.50kb = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
Construct genomic range objects so that SNPs and regions can be intersected and compared.
cojo_gr <- with(cojo, GRanges(seqnames=CHR, ranges=IRanges(start=range.left, end=range.right), SNP=SNP))
rp_gr <- with(rp, GRanges(seqnames=CHR, ranges=IRanges(start=range.left, end=range.right), SNP=SNP))
wray_gr <- with(wray, GRanges(seqnames=Chr., ranges=IRanges(start=range.left, end=range.right, SNP=SNP)))
howard_gr <- with(howard, GRanges(seqnames=Chromosome, ranges=IRanges(start=range.left, end=range.right, SNP=`Marker Name`)))
levey_gr <- with(levey, GRanges(seqnames=CHR, ranges=IRanges(start=LEFT, end=RIGHT)))
giannakopoulou_gr <- with(giannakopoulou, GRanges(seqnames=CHR, ranges=IRanges(start=LEFT, end=RIGHT)))
gwas_catalog_gr <- with(gwas_catalog, GRanges(seqnames=CHR, ranges=IRanges(start=LEFT, end=RIGHT)))Tally entries in the GWAS catalog for each region
parse_catalog_entry <- function(catalog_entry) {
# extract LD and RSID from first element
ld_snp_match <- str_match(catalog_entry[1], "\\((.+)\\)(rs[[:digit:]]+)")
ld <- as.numeric(ld_snp_match[,2])
snp <- ld_snp_match[,3]
# parse rest of elements into phenotype(PubMed ID)(P-value)
phenotype_matches <- str_match(catalog_entry[-1], "(.+)\\(([[:digit:]]+)\\)\\((.+)\\)")
phenotypes <- phenotype_matches[,2]
pubmeds <- as.numeric(phenotype_matches[,3])
p_values <- as.numeric(phenotype_matches[,4])
return(data.frame(ld, catalogSNP=snp, phenotype=phenotypes, pubmed_id=pubmeds, P=p_values))
}
rp_gwas_catalog_entries <-
plyr::ddply(filter(rp, gwas_catalog_span.6 != '-'), ~SNP, function(rp_entry) {
# get GWAS catalog cell
gwas_catalog <- rp_entry$gwas_catalog_span.6
# split out into catalog entries (separated by /, removing first empty element)
catalog_entries <- str_split(gwas_catalog, "/")[[1]]
catalog_entries_complete <- catalog_entries[which(catalog_entries != "")]
# split SNP entries by ";"
catalog_entries_list <- str_split(catalog_entries_complete, ";")
# parse each entry
catalog_entries_df <- plyr::ldply(catalog_entries_list, parse_catalog_entry)
return(catalog_entries_df)
}) %>% as_tibble()rp_genes_dist <-
plyr::ddply(filter(rp, `genes.6.50kb(dist2index)` != '-'), ~SNP, function(rp_entry) {
genes_dist <- rp_entry$`genes.6.50kb(dist2index)`
genes_dist_list <- str_split(genes_dist, ',')[[1]]
genes_dist_match <- str_match(genes_dist_list, "(.+)\\((.+)\\)")
return(data.frame(gene=genes_dist_match[,2], dist2index=as.numeric(genes_dist_match[,3])))
}) %>% as_tibble()Find overlaps between clumped, COJO, and previous results. Append and then reduce all regions
all_gr <- reduce(c(cojo_gr, rp_gr, wray_gr, howard_gr, levey_gr, giannakopoulou_gr, gwas_catalog_gr))Find overlaps and make lists for an upset plot. Take the index from the combined set as the element, then find which of them are found within each of the sets of SNPs. The upset plot function handles finding each combination of intersections.
hits_upset <- list(MDD3_COJO=unique(findOverlaps(all_gr, cojo_gr)@from),
MDD3_Clump=unique(findOverlaps(all_gr, rp_gr)@from),
Wray=unique(findOverlaps(all_gr, wray_gr)@from),
Howard=unique(findOverlaps(all_gr, howard_gr)@from),
Levey=unique(findOverlaps(all_gr, levey_gr)@from),
Giannakopoulou=unique(findOverlaps(all_gr, giannakopoulou_gr)@from),
Catalog=unique(findOverlaps(all_gr, gwas_catalog_gr)@from))
upset(fromList(hits_upset), nsets=7, order.by='freq', text.scale=2)Find which COJO regions overlap with Howard
cojo_howard_overlaps <- findOverlaps(cojo_gr, howard_gr)
cojo_howard_overlaps## Hits object with 117 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 1 1
## [2] 6 2
## [3] 7 3
## [4] 10 4
## [5] 11 4
## ... ... ...
## [113] 583 97
## [114] 587 98
## [115] 595 100
## [116] 597 101
## [117] 612 102
## -------
## queryLength: 622 / subjectLength: 102
Count number of regions in Howard that overlap:
howard %>% slice(unique(cojo_howard_overlaps@to)) %>% count()## # A tibble: 1 × 1
## n
## <int>
## 1 90
Find which COJO regions overlap with Levey
cojo_levey_overlaps <- findOverlaps(cojo_gr, levey_gr)
cojo_levey_overlaps## Hits object with 265 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 1 110
## [2] 6 2
## [3] 7 65
## [4] 10 77
## [5] 10 26
## ... ... ...
## [261] 600 154
## [262] 601 154
## [263] 608 120
## [264] 612 153
## [265] 613 211
## -------
## queryLength: 622 / subjectLength: 223
Count number of regions in Levey that overlap:
levey %>% slice(unique(cojo_levey_overlaps@to)) %>% count()## # A tibble: 1 × 1
## n
## <int>
## 1 196
Overlaps with previous findings.
cojo_known_overlaps <- findOverlaps(cojo_gr, reduce(c(wray_gr, howard_gr, levey_gr, giannakopoulou_gr, gwas_catalog_gr)))
cojo_known <- cojo %>% slice(unique(cojo_known_overlaps@from))
catalog_known <- rp_gwas_catalog_entries %>% filter(SNP %in% cojo_known$SNP) %>% count(phenotype) %>% arrange(desc(n))
catalog_known## # A tibble: 201 × 2
## phenotype n
## <chr> <int>
## 1 Serum_metabolit... 186
## 2 Schizophrenia 64
## 3 Intelligence_(MTAG) 61
## 4 Waist_circumfer... 44
## 5 Neuroticism 32
## 6 Autism_spectrum... 30
## 7 Trans_fatty_acid_levels 29
## 8 Depression_(broad) 28
## 9 Glycerophosphol... 27
## 10 Depressive_symp... 24
## # … with 191 more rows
Newly discovered regions
cojo_new <- cojo %>% slice(unique(-cojo_known_overlaps@from)) %>% arrange(P)
cojo_new %>%
select(region, snp_idx, CHR, SNP, BP, P, pJ) %>%
group_by(region)## # A tibble: 309 × 7
## # Groups: region [305]
## region snp_idx CHR SNP BP P pJ
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 174 1 5 rs1993739 153215007 5.71e-20 5.72e-20
## 2 362 1 12 rs2363585 60791165 1.69e-16 1.69e-16
## 3 310 1 10 rs12778915 77617557 2.35e-16 2.35e-16
## 4 237 1 7 rs12666306 115082406 1.38e-15 1.39e-15
## 5 395 1 14 rs17100626 33772933 2.05e-15 2.06e-15
## 6 318 1 10 rs3808964 125426627 7.13e-15 7.14e-15
## 7 101 1 3 rs116310555 71814431 4.24e-14 4.24e-14
## 8 392 1 13 rs9524024 94022948 8.23e-14 8.24e-14
## 9 136 1 4 rs13134858 115522306 9.96e-14 9.97e-14
## 10 355 1 12 rs7973992 39168233 1.20e-13 1.21e-13
## # … with 299 more rows
rp_gwas_catalog_entries %>% filter(SNP %in% cojo_new$SNP) %>% count(phenotype) %>% arrange(desc(n)) %>% filter(!phenotype %in% catalog_known$phenotype)## # A tibble: 50 × 2
## phenotype n
## <chr> <int>
## 1 Body_mass_index_(adult) 3
## 2 Monocyte_count 3
## 3 Pulse_pressure 3
## 4 Blood_osmolalit... 2
## 5 Blood_pressure 2
## 6 Mean_corpuscular_volume 2
## 7 Survival_in_col... 2
## 8 Adiposity 1
## 9 Aspirin_hydroly... 1
## 10 Bladder_cancer 1
## # … with 40 more rows
rp_genes_dist %>% filter(SNP %in% cojo_new$SNP) %>% group_by(SNP) %>% filter(dist2index == min(dist2index)) %>% ungroup() %>% select(gene) %>% distinct()## # A tibble: 310 × 1
## gene
## <chr>
## 1 CDH18
## 2 NTRK3
## 3 GTF2IRD1
## 4 NXPH1
## 5 MAGI2
## 6 ETV6
## 7 AQP12A
## 8 SHTN1
## 9 VAX1
## 10 MIR3663HG
## # … with 300 more rows
Calculate power for previous versus current GWAS
# use sum of effective sample sizes and case rate of 50% rather than sum of cases and controls
wray_power <- genpwr.calc(calc='es', model='logistic', ge.interaction=NULL, N=405961, Case.Rate=0.5, k=NULL, MAF=seq(0.005, 0.49, by=0.005), Power=0.8, Alpha=5e-8, True.Model='Additive', Test.Model='Additive')
levey_power <- genpwr.calc(calc='es', model='logistic', ge.interaction=NULL, N=938012, Case.Rate=0.5, k=NULL, MAF=seq(0.005, 0.49, by=0.005), Power=0.8, Alpha=5e-8, True.Model='Additive', Test.Model='Additive')
mdd2022_power <- genpwr.calc(calc='es', model='logistic', ge.interaction=NULL, N=2*776935, Case.Rate=0.5, k=NULL, MAF=seq(0.005, 0.49, by=0.005), Power=0.8, Alpha=5e-8, True.Model='Additive', Test.Model='Additive')frq_u_col <- str_subset(names(cojo), 'FRQ_U')
cojo_known_novel <- bind_rows(
mutate(cojo_known, Association='Known'),
mutate(cojo_new, Association='Novel')) %>%
mutate(BETA=log(OR)) %>%
select(SNP, Association, OR, BETA, FRQ=starts_with('FRQ_U')) %>%
mutate(MAF=if_else(FRQ <= 0.5, true=FRQ, false=1-FRQ))
power_known_novel <- bind_rows(
transmute(levey_power, Power='Levey', MAF, OR=`OR_at_Alpha_5e-08`),
transmute(mdd2022_power, Power='MDD3', MAF, OR=`OR_at_Alpha_5e-08`)
)
ggplot(cojo_known_novel, aes(x=MAF, y=exp(abs(BETA)))) +
geom_point(aes(color=Association)) +
geom_line(mapping=aes(y=OR, linetype=Power), data=power_known_novel, size=1) +
scale_y_continuous('OR', limits=c(1, 1.1))## Warning: Removed 2 row(s) containing missing values (geom_path).
daner <- read_tsv(snakemake@input$daner)## Rows: 7131733 Columns: 20
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): SNP, A1, A2, Direction
## dbl (16): CHR, BP, FRQ_A_525197, FRQ_U_3362335, INFO, OR, SE, P, ngt, HetISq...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gwas <- daner %>% filter(-log10(P)>=3) %>% mutate(CHR=if_else(CHR!=23, true=as.character(CHR), false='X'))
manhattn <- ggman(as.data.frame(gwas), snp = "SNP", bp = "BP", chrom = "CHR", pvalue = "P", ymin=3, lineColour='#111111') +
theme_minimal() +
theme(panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank())## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
manhattnmanhattn + scale_colour_manual(values=c('#ca0020', '#92c5de'))manhattn + scale_colour_manual(values=microshades_palette('micro_cvd_blue', 2, lightest = FALSE))Format COJO results for supplementary table
cojo_format <-
cojo %>%
arrange(region) %>%
select(Locus=region, SNP.Index=snp_idx, SNP, CHR, BP, A1, A2,
FRQ_A=starts_with('FRQ_A'), FRQ_U=starts_with('FRQ_U'),
INFO, OR, SE, P, ngt, Direction,
HetISqt, HetPVa, Nca, Nco, Neff_half,
bJ, bJ_se, pJ, LD_r,
Locus.Left=range.left, Locus.Right=range.right, N.Sig.SNPs=n_sig_snps)
write_tsv(cojo_format, "docs/tables/meta_snps_full_eur.cojo.format.txt")